The file BikeDay.Rdata contains information on the
bike-sharing rental service in Washington D.C., USA, corresponding to
years 2011 and 2012. This file contains only one data frame,
day, with 731 rows (one for each day of years 2011 and
2012, that was a leap year) and 16 columns:
instant row index, going from 1 to 731; dteday date; season (1:spring, 2:summer, 3:fall, 4:winter); yr year (0: 2011, 1: 2012); mnth 1 for January, until 12 for December; holiday whether the day is a holiday or not; weekday day of the week (0 Sunday to 6 Saturday); workingday if day is neither weekend nor holiday is 1,
otherwise is 0weathersit:temp Normalized temperature in Celsius. The values are
divided to 41 (max)atemp Normalized feeling temperature in Celsius The
values are divided to 50 (max)hum Normalized humidity. The values are divided to 100
(max)windspeed Normalized wind speed. The values are divided
to 67 (max); casual count of rental bikes by casual users (not
registered); registered count of rental bikes by registered
users.cnt count of total rental bikes (casual +
registered)In particular we are interested in the joint distribution of
temp and casual for year 2012:
# Exploratory plots of the daily bike-rental data.
plot(day$cnt, type="l", col=8) # timeseries plot
# Overlay the daily totals coloured by season.
points(day$cnt, col=day$season, pch=19, cex=.5)
# Normalized temperature over time.
plot(day$temp, type="l")
# Joint distribution of temperature and total rentals.
plot(day$temp, day$cnt)
# Same scatterplot coloured by year (default palette: 1 = black for 2011,
# 2 = red for 2012).
plot(day$temp, day$cnt, col=day$yr+1)
# Days with season == 2 in 2012 (yr == 1).
# NOTE(review): the description above codes 2 as "summer" but the title
# says "Spring 2012" — confirm the season coding.
selected <- which((day$season==2) &(day$yr==1))
plot(day$registered[selected], type="l", col=8,
main="Spring 2012", ylab="Registered Users", xlab="Day")
# Highlight working days (workingday + 1: red) vs other days (black).
points(day$registered[selected], col=day$workingday[selected]+1, pch=19)
# Distribution of registered users by day of the week (0 = Sunday).
boxplot(day$registered~day$weekday)
mclust for the following task. Do a model
based clustering of these data assuming a Gaussian Mixture Model,
allowing varying volume, shape, and orientation for different components
in the mixture. Choose \(k_{BIC}\), the
best number of clusters \(k\in\{2,\ldots,6\}\) according to
BIC.Mclust (do 4 different
graphics: BIC, classification,
uncertainty and density).
load("./BikeDay.Rdata")
# Fit a Gaussian Mixture Model to (temp, casual) for year 2012 (yr == 1),
# with unconstrained covariances ("VVV": varying volume, shape and
# orientation), choosing the number of components k in 2:6 by BIC.
X <- as.matrix(day[day$yr==1,c(10,14)])
rng <- 2:6
GMM <- Mclust(X, G = rng, modelNames = "VVV")
# GMM_BIC <- mclustBIC(X,G=rng, modelNames = "VVV")
plot(GMM, what="BIC")
plot(GMM, what="classification")
plot(GMM, what="uncertainty")
plot(GMM, what="density")
#points(X)
# Mclust stores the BIC-optimal number of components directly in GMM$G;
# this is equivalent to rng[which.max(GMM$BIC)] here but does not break
# if several modelNames are fitted or the BIC matrix contains NAs.
k <- GMM$G
The best number of clusters with respect to the BIC criterion is \(k=3\)
density plot with the
non-parametric density estimation of
(temp,casual) obtained when using the kernel
estimator implemented in sm::sm.density, with the
bandwidths proportional to the standard deviations in both dimensions:
\(h=a\cdot(StdDev(\mathtt{temp}),StdDev(\mathtt{casual}))\).
Use a=0.25.To compare the density functions we will plot both of them in 3d
# the idea is to use the 3d plots to see/show clearly the differences between both densities but still show the plot we are asked for.
# todo: add titles to the plots
# Parameters of the fitted mixture: mixing proportions, mean vectors
# (one column per component) and covariance matrices (one slice per
# component of a d x d x k array).
mixprobs <- GMM$parameters$pro
mixmus <- GMM$parameters$mean
mixsigmas <- GMM$parameters$variance$sigma
# Density of a mixture of multivariate Gaussians.
#
# X           matrix of evaluation points, one row per point.
# means       d x k matrix; column i holds the mean of component i.
# covariances d x d x k array; slice i holds the covariance of component i.
# probs       length-k vector of mixing proportions (should sum to 1).
#
# Returns a numeric vector with the mixture density at each row of X.
density_mixture_of_multivariate_gaussians <- function(X, means, covariances, probs) {
  num_components <- dim(means)[2]
  density <- rep(0, nrow(X))
  # seq_len() is safe for zero components, unlike 1:num_components.
  for (i in seq_len(num_components)) {
    density <- density + probs[i] * dmvnorm(X, mean = means[, i], sigma = covariances[, , i])
  }
  density
}
# Grid resolution for the density surfaces (ln x ln evaluation points).
ln = 50
temp_values <- seq(min(X[,"temp"]), max(X[,"temp"]), length.out = ln)
casual_values <- seq(min(X[,"casual"]), max(X[,"casual"]), length.out = ln)
# Regular grid covering the observed (temp, casual) range.
mesh <- expand.grid(temp = temp_values, casual = casual_values)
# Parametric (GMM) density evaluated at every grid point.
mesh$mix_dens <- density_mixture_of_multivariate_gaussians(as.matrix(mesh), mixmus, mixsigmas, mixprobs)
# expand.grid varies its first argument fastest, so filling column-wise
# (byrow = FALSE) makes rows correspond to temp and columns to casual.
mix_dens_matrix <- matrix(mesh$mix_dens, nrow = ln, ncol = ln, byrow = FALSE)
persp(temp_values, casual_values, mix_dens_matrix, col = 2,
xlab = "temp", ylab = "casual", zlab = "", main = "Density estimation using a mixture of gaussians", theta = 160, phi = 30, box=TRUE, axes=T, ticktype="detailed", cex=0.1, xlim = range(X[, "temp"]), ylim = range(X[, "casual"]), zlim = c(0,0.005))
#Todo: write comparison
# NOTE(review): par(mfrow = c(1, 2)) only affects plots drawn after this
# point, not the persp() call above — confirm the intended layout.
par(mfrow = c(1, 2))
# Bandwidths proportional to the per-variable standard deviations (a = 0.25).
h = 0.25 * c(sd(X[, "temp"]), sd(X[, "casual"]))
# plot(X)
# sm.density(X, h=h, display="slice", col=2, add=TRUE)
# Non-parametric kernel density evaluated on the same grid (display="none":
# no plot is drawn, only the estimate is returned).
mesh$np_dens <- sm.density(X,h=h,display="none",eval.grid=FALSE, eval.points=mesh[, c("temp", "casual")])$estimate
np_dens_matrix <- matrix(mesh$np_dens, nrow = ln, ncol = ln, byrow = FALSE)
# Same viewpoint and z-limits as the parametric surface so the two
# estimates can be compared directly.
persp(temp_values, casual_values, np_dens_matrix, col = 2, xlab = "temp", ylab = "casual", zlab = "", main = "Non-parametric density estimation", theta = 160, phi = 30, box=TRUE, axes=T, ticktype="detailed", cex=0.1, xlim = range(X[, "temp"]), ylim = range(X[, "casual"]), zlim = c(0,0.005))
#sm.density(X, h=h, display="persp", col=2, theta = 160, phi = 30, main="bradfsd", cex=0.1, xlim = range(X[, "temp"]), ylim = range(X[, "casual"]), zlim = c(0,0.005), zlab="")
#title("Non-parametric density estimation")
# 2-D view: scatterplot with kernel-density contours overlaid at the
# requested probability levels.
plot(X)
sm.density(X, h=h, display="slice", col=2, add=TRUE, props = c(5,10,25,50,75))
# Fixed: the original line used typographic quotes (“BIC”), which do not
# parse in R, and referenced GMM_BIC, which is only computed later (its
# definition at the top is commented out). The BIC trace of the fitted
# model gives the same plot.
plot(GMM, what = "BIC")
temp
and casual, conditional to this cluster, using the kernel
estimator implemented in sm::sm.density with the bandwidths
proportional to the standard deviations in both dimensions: \(h=a\cdot(StdDev(\mathtt{temp}),StdDev(\mathtt{casual}))\).
Use \(a=0.4\) and compute the
standard deviations at each cluster.# scm=summary(mclustBIC(X, G=k, modelNames = "VVV"),X)
# indexes<- scm$classification
# alternatively,
# Cluster labels from the fitted mixture (equivalently one could refit
# mclustBIC at G = k and take summary(...)$classification, as in the
# commented lines above).
indexes <- GMM$classification
plot(X)
# For each cluster, overlay a kernel density contour of (temp, casual)
# conditional on the cluster, with per-cluster bandwidths
# h = 0.4 * (sd(temp), sd(casual)) computed within the cluster.
for (j in seq_len(k)) {
  cluster_j <- (indexes == j)
  h <- 0.4 * c(sd(X[cluster_j, "temp"]), sd(X[cluster_j, "casual"]))
  sm.density(X[cluster_j, ], h = h,
             display = "slice",
             col = j, props = c(75), cex = 4, add = TRUE)
}
fpc to check if it is possible to merge
some of the components in the Gaussian Mixture Model previously
estimated. Let \(k^*\) be the final
number of clusters after the merging process. Do the scatterplot of
(temp,casual) with colors according to the new
\(k^*\) clusters.mergenormals with the
option method="bhat".
scm=summary(mclustBIC(X, G=k, modelNames = "VVV"),X)
# Merge GMM components that fpc judges to belong to the same cluster,
# using the Bhattacharyya-distance criterion (method = "bhat").
fpc_mix<- fpc::mergenormals(X,method="bhat",scm)
Y=as.data.frame(X)
# Scatterplot of (temp, casual) coloured by the merged cluster labels.
ggplot(Y,aes(x=temp,y=casual,colour=as.factor(fpc_mix$clustering)))+geom_point()
temp,casual), conditional to this cluster,
using the kernel estimator implemented in sm::sm.density
with the bandwidths proportional to the standard deviations in both
dimensions: \(h=a\cdot(StdDev(\mathtt{temp}),StdDev(\mathtt{casual}))\).
Use \(a=0.4\) and compute the
standard deviations at each cluster.
indexes<- fpc_mix$clustering
plot(X)
# One kernel-density contour per merged cluster, with per-cluster
# bandwidths h = 0.4 * (sd(temp), sd(casual)).
# Fixed: iterate over the actual cluster labels. After merging,
# fpc_mix$clusternumbers holds the surviving original cluster numbers,
# which need not be 1..k*, so `j in 1:length(...)` could compare
# `indexes == j` against labels that do not exist (empty subsets).
for (j in fpc_mix$clusternumbers) {
  cluster_j <- (indexes == j)
  h <- 0.4 * c(sd(X[cluster_j, "temp"]), sd(X[cluster_j, "casual"]))
  sm.density(X[cluster_j, ], h = h,
             display = "slice",
             col = j, props = c(75), cex = 4, add = TRUE)
}
temp,casual), after centering and
scaling both variables (do Xs <- scale(X)). Try
\(\varepsilon\in\{0.25,0.5\}\) and
\(\texttt{minPts}\in\{10,15,20\}\).
Which combination of the tuning parameters do you consider the best
one?mergenormals
(print their cross-table).
Xs <- scale(X)
# NOTE(review): consider keeping only the final plot for each combination.
# Grid of DBSCAN tuning parameters to try on the standardised data Xs.
epsilons <- c(0.25, 0.5)
minPts_values <- c(10, 15, 20)
results <- list()
# Perform DBSCAN clustering for every (eps, minPts) combination.
for (eps in epsilons) {
  for (minPts in minPts_values) {
    # Fixed: the original called dbscan() twice per combination (the call
    # was duplicated on consecutive lines); a single call is sufficient.
    dbscan_result <- dbscan(Xs, eps = eps, MinPts = minPts, showplot = 1)
    results[[paste0("Eps_", eps, "_MinPts_", minPts)]] <- dbscan_result
  }
}
results
## $Eps_0.25_MinPts_10
## dbscan Pts=366 MinPts=10 eps=0.25
## 0 1
## border 105 54
## seed 0 207
## total 105 261
##
## $Eps_0.25_MinPts_15
## dbscan Pts=366 MinPts=15 eps=0.25
## 0 1 2 3
## border 135 34 14 44
## seed 0 76 1 62
## total 135 110 15 106
##
## $Eps_0.25_MinPts_20
## dbscan Pts=366 MinPts=20 eps=0.25
## 0 1 2 3
## border 178 42 32 27
## seed 0 62 6 19
## total 178 104 38 46
##
## $Eps_0.5_MinPts_10
## dbscan Pts=366 MinPts=10 eps=0.5
## 0 1 2
## border 7 9 16
## seed 0 300 34
## total 7 309 50
##
## $Eps_0.5_MinPts_15
## dbscan Pts=366 MinPts=15 eps=0.5
## 0 1 2
## border 17 18 27
## seed 0 289 15
## total 17 307 42
##
## $Eps_0.5_MinPts_20
## dbscan Pts=366 MinPts=20 eps=0.5
## 0 1
## border 59 35
## seed 0 272
## total 59 307
# In these tables: column 0 = noise (unassigned) points, "seed" rows = core points, "border" rows = border points, "total" = seed + border per cluster.
Choosing the best model by inspecting the plots: the best fit seems to be obtained with \(\texttt{eps}=0.5\) and \(\texttt{minPts}=15\).
best_dbscan_result <- results[["Eps_0.5_MinPts_15"]]
# Refit the VVV mixture over G = 2:6 and summarise it at the BIC optimum.
GMM_BIC <- mclustBIC(X, G = 2:6, modelNames = "VVV")
GMM_BIC_summary <- summary(GMM_BIC, X)
# Fixed: the mclust summary was fitted on the unscaled X, so mergenormals
# must receive X as well; the original passed the scaled Xs, mixing data
# and model parameters on incompatible scales (cf. the earlier call that
# correctly used X).
mergenormals_result <- mergenormals(X, GMM_BIC_summary, method = "bhat")
# mergenormals_result$clustering
# Cross-table: DBSCAN labels (rows; 0 = noise) vs merged-GMM labels (cols).
cross_table <- table(best_dbscan_result$cluster, mergenormals_result$clustering)
print(cross_table)
##
## 1 2
## 0 0 17
## 1 306 1
## 2 0 42